#!/usr/bin/python
#-*- coding:utf-8 -*-
#############################################################
#File Name: te-polymorphisms-analysis.py                    #
#Author: Elise van Bree & Mischa Lundberg                   #
#############################################################

# import necessary libraries
import argparse
import os
import sys
import pandas as pd
#from pyliftover import LiftOver
import numpy as np
import gc
import pysam
from collections import defaultdict
from Bio.Seq import Seq
from Bio import SeqIO
import subprocess
import xlsxwriter
import multiprocessing
from multiprocessing.pool import ThreadPool
import time




# FUNCTION to run all functions
def main(args):
    """Run the complete TE polymorphism pipeline for the parsed command-line arguments.

    Steps, in order:
      1. Detect the installed LASTAL version and load the chromosome sizes (``args.g``).
      2. Unless ``args.i`` is a previously written ``la_result`` table: convert
         carriage returns in the BED input to newlines, pad the coordinates by
         ``args.p`` and cut the reference (``args.r``) into TE / upstream flank /
         downstream flank sequences.
      3. Load the sequencing FASTA (``args.s``), build its LAST database when no
         index exists yet and align the flanks of every TE against it with
         ``alignLocationparallel`` (on a thread pool when ``args.cores`` > 1).
         The results are written to ``<args.o>_la_results.csv`` and
         ``<args.o>_not_aligned.xlsx``.
      4. Optionally filter the alignment table (``args.threshold`` / ``args.f``).
      5. Compare every alignment result with ``blastLocationParallel`` and pass
         the collected table to ``analysis``.
      6. Remove the temporary files registered in the global ``to_delete``.

    Returns ``[]`` when no LAST index exists for ``args.s`` even after trying to
    build one; otherwise returns ``None``.
    """
    print("*** Starting script: " + _main_timestamp())
    global to_delete
    to_delete = []

    lastal_version = _main_lastal_version()
    print("*** Your LASTAL version: %s" % lastal_version)

    # Normalise once: the original compared args.cores to 1 directly in some
    # places and int(args.cores) in others, which disagree when the value is a
    # string (on Python 2 any string compares greater than an int).
    cores = int(args.cores)
    # A "la_result" input is the table written by an earlier run, so the
    # alignment step is skipped and the table is read back instead.
    reuse_alignment = "la_result" in args.i

    chrom_sizes_dict = obtain_chrom_sizes(args.g)
    if not reuse_alignment:
        # Done in Python instead of a `tr ...; mv ...` shell pipeline so that
        # file names containing spaces or parentheses work without escaping or
        # renaming the input file.
        _main_cr_to_lf(args.i)
        padded_coordinates = mod_coordinates(args.i, int(args.p), chrom_sizes_dict)
        orig_coordinates = original_coordinates(args.i)
        print("*** Generating the TE_dict_ref dictionary from the fasta file: " + _main_timestamp())
        TE_dict_ref = parseReference(args.r, orig_coordinates, padded_coordinates, args.p)

    print("*** Generating the WGS dictionary from the fasta file: " + _main_timestamp())
    with open(args.s) as wgs_handle:
        wgs_dict = SeqIO.to_dict(SeqIO.parse(wgs_handle, "fasta"))

    # Build the LAST database only when no index files exist yet.
    if _main_last_index_missing(args.s):
        build_last_db(args.s, args.t)

    print("*** Searching for input elements in the sequencing data: " + _main_timestamp())
    headerUp = ['ID', 'start', 'score-up', 'alnSize-up']
    headerDown = ['ID', 'stop', 'score-down', 'alnSize-down']

    if _main_last_index_missing(args.s):
        sys.stderr.write('Warning: no lastdb index for %s\n, try to run lastdb index %s and rerun the script.' %(args.s,args.s))
        return []

    if not reuse_alignment:
        total_sequence = pd.DataFrame()
        one_notAligned = pd.DataFrame()
        dfMergedNaN = pd.DataFrame(columns=headerUp + headerDown[1:])
        min_match = int(args.e)

        pool = None
        if cores > 1:
            print("starting multiprocessing of alignment: " + _main_timestamp())
            pool = ThreadPool(cores)
            # The running number gives every worker its own temporary flank file.
            pending = [
                pool.apply_async(
                    alignLocationparallel,
                    [run, TE_dict_ref, min_match, args.s, key, wgs_dict,
                     headerUp, headerDown, args.o, lastal_version])
                for run, key in enumerate(TE_dict_ref)
            ]
            print("Resultlist length: %s" % len(pending))
            outcomes = (job.get() for job in pending)
        else:
            # run == -1 tells the worker to use the shared single-core temp file.
            outcomes = (
                alignLocationparallel(
                    -1, TE_dict_ref, min_match, args.s, key, wgs_dict,
                    headerUp, headerDown, args.o, lastal_version)
                for key in TE_dict_ref
            )

        try:
            for currentResult in outcomes:
                # Wrapped in a 1-tuple so a tuple result cannot be mistaken for
                # the %-format argument list.
                print("CurrentResult: %s" % (currentResult,))
                dfMergedNaN = _main_collect(dfMergedNaN, currentResult[0][0], "dfMergedNaN")
                one_notAligned = _main_collect(one_notAligned, currentResult[1][0], "one_notAligned")
                total_sequence = _main_collect(total_sequence, currentResult[2][0], "total_sequence")
        finally:
            if pool is not None:
                pool.close()
        if pool is not None:
            print("Done with running all poolworkers")

        saveAlignmentData(one_notAligned, dfMergedNaN, args.o)

        la_results = total_sequence
        la_results.to_csv((args.o + "_la_results.csv"), sep='\t', encoding='utf-8')
    else:
        la_results = pd.read_csv(args.i, sep='\t', encoding='utf-8', header=0)

    if args.threshold > 0 and not args.f:
        args.f = True
        # NOTE(review): presumably a module-level filter() defined further down
        # in this file that shadows the builtin -- confirm.
        la_results = filter(la_results)

    if cores > 1:
        print("starting multiprocessing of blasting: " + _main_timestamp())
        pool = ThreadPool(cores)
        print("*** Comparing reference with database sequences: " + _main_timestamp())
        try:
            pending = [
                pool.apply_async(blastLocationParallel, [index, row, args.o, args.qc])
                for index, row in la_results.iterrows()
            ]
            outcomes = [job.get() for job in pending]
        finally:
            pool.close()
    else:
        outcomes = [
            blastLocationParallel(index, row, args.o, args.qc)
            for index, row in la_results.iterrows()
        ]
    # Results with at most one element are dropped, as before.
    blast_results = [outcome for outcome in outcomes if len(outcome) > 1]

    labels = ['TE ID', 'Query ID', 'Gaplist Ref', 'Gaplist Query', 'P/A poly', 'Ref length', 'Query length']
    results = pd.DataFrame.from_records(blast_results, columns=labels)
    analysis(results, args.o)

    for entry in to_delete:
        if os.path.isfile(entry):
            os.remove(entry)


def _main_timestamp():
    """Return the current local time as text for the progress messages."""
    return str(time.asctime(time.localtime(time.time())))


def _main_lastal_version():
    """Return the integer version reported by ``lastal --version``, or -1 on any failure."""
    try:
        return int(subprocess.check_output(["lastal", "--version"]).split()[1])
    except Exception:
        # Best effort: a missing or unparsable version must not stop the run.
        print("LASTAL version error: %s" % (sys.exc_info()[0],))
        return -1


def _main_cr_to_lf(path):
    """Replace every carriage return in the file *path* with a newline, in place."""
    with open(path, "rb") as handle:
        content = handle.read()
    if b"\r" in content:
        with open(path, "wb") as handle:
            handle.write(content.replace(b"\r", b"\n"))


def _main_last_index_missing(fasta):
    """Return True when neither ``<fasta>.tis`` nor ``<fasta>.prj`` exists."""
    return not os.path.exists(fasta + '.tis') and not os.path.exists(fasta + '.prj')


def _main_collect(collected, new_rows, label):
    """Return *collected* extended by *new_rows*, logging the step under *label*.

    Empty *new_rows* leave *collected* untouched; an empty *collected* is
    replaced by *new_rows*; otherwise both are concatenated with a fresh index.
    """
    if len(new_rows) == 0:
        return collected
    if len(collected) == 0:
        print("%s should be empty: %s \nand being filled with: %s" % (label, collected, new_rows))
        return new_rows
    print("%s should NOT be empty: %s \nand being appended with: %s" % (label, collected, new_rows))
    return pd.concat([collected, new_rows], ignore_index=True)

# FUNCTION to make a dictionary of the size of each chromosome.
def obtain_chrom_sizes(chrom_sizes_file):
    """Read a chromosome-sizes file into a ``{chromosome: size}`` dictionary.

    Every non-blank line is split on whitespace; the first field is the
    chromosome name and the second its size (converted to ``int``).

    Side effect: sets the module-level flag ``chr_in_chrom_sized_dict`` to True
    when at least one chromosome name contains "chr", otherwise to False.

    Raises:
        IndexError: a non-blank line has fewer than two fields.
        ValueError: a size field is not an integer.
    """
    print("*** Making directory with chromosome sizes: " + str(time.asctime( time.localtime(time.time()) )))
    global chr_in_chrom_sized_dict
    chr_in_chrom_sized_dict = False
    chrom_sizes_dict = {}
    # The context manager closes the file even when a malformed line raises.
    with open(chrom_sizes_file, 'r') as inp:
        for line in inp:
            line_list = line.split()
            if not line_list:
                # Blank lines (e.g. a trailing newline) used to raise IndexError.
                continue
            chrom = line_list[0]
            if "chr" in chrom:
                chr_in_chrom_sized_dict = True
            chrom_sizes_dict[chrom] = int(line_list[1])
    return chrom_sizes_dict


# FUNCTION to add padding to start and stop coordinates of the TEs and check if they are valid.
def mod_coordinates(input_file, padding, chrom_sizes_dict):
    """Return the BED records of *input_file* with padded, clipped coordinates.

    Args:
        input_file: tab-separated file with at least six columns
            (chrom, start, stop, name, score, strand).
        padding: number of bases subtracted from start and added to stop.
        chrom_sizes_dict: ``{chromosome: size}``; stop is clipped to the size
            and start is clipped to 0.

    Returns:
        A list of ``[chrom, start, stop, name, score, strand]`` lists in which
        start and stop are decimal strings. The strand field is taken verbatim
        from the file, so it still carries the line's trailing newline.

    Raises:
        SystemExit: the input and the chromosome sizes disagree on the "chr" prefix.
        KeyError: a chromosome is missing from *chrom_sizes_dict*.
        IndexError: a non-blank record has fewer than six fields.
    """
    print("*** Adding padding to the coordinates of the input file: " + str(time.asctime( time.localtime(time.time()) )))
    with open(input_file, 'r') as f:
        tempdata = f.readlines()

    # Stray carriage returns (old Mac / mixed line endings) also separate records.
    data = []
    for line in tempdata:
        data += line.split('\r')

    # Same value obtain_chrom_sizes() stores in its module-level flag, derived
    # from the argument so this function does not depend on that side effect.
    sizes_use_chr_prefix = any("chr" in chrom for chrom in chrom_sizes_dict)

    coordinates_list = []
    for line in data:
        if not line.strip():
            # Empty or whitespace-only pieces (blank lines, the remainder after
            # a trailing '\r') used to raise IndexError below.
            continue
        line_data = line.split('\t')
        chrom = line_data[0]
        start = float(line_data[1])
        stop = float(line_data[2])
        name = line_data[3]
        score = line_data[4]
        strand = line_data[5]
        if ("chr" in chrom) != sizes_use_chr_prefix:
            error_msg = "ERROR ** Either input -i or -g contians the prefix \"chr\" but the other one doesn\'t. Please fix this by e.g. awk -F \"chr\" '{print $2}' file.containing.the.chr.prefix > file.nochr"
            sys.exit(error_msg)
        chrom_end = chrom_sizes_dict[chrom]

        # Add padding, then clip to the valid range of the chromosome.
        start = max(start - padding, 0)
        stop = min(stop + padding, chrom_end)
        coordinates_list.append([chrom, str(int(start)), str(int(stop)), name, score, strand])
    return coordinates_list


# FUNCTION to get original coordinates in the same format as padded_coordinates.
def original_coordinates(input_file):
    """Return the unmodified BED records of *input_file*.

    The file is expected to be tab-separated with at least six columns
    (chrom, start, stop, name, score, strand).

    Returns:
        A list of ``[chrom, start, stop, name, score, strand]`` lists of
        strings, in file order. The strand field is taken verbatim from the
        file, so it still carries the line's trailing newline.

    Raises:
        IndexError: a non-blank record has fewer than six fields.
    """
    print("*** Retrieving original coordinates of input file: " + str(time.asctime( time.localtime(time.time()) )))
    with open(input_file, 'r') as f:
        tempdata = f.readlines()

    # Stray carriage returns (old Mac / mixed line endings) also separate records.
    data = []
    for line in tempdata:
        data += line.split('\r')

    coordinates_list = []
    for line in data:
        if not line.strip():
            # Empty or whitespace-only pieces (blank lines, the remainder after
            # a trailing '\r') used to raise IndexError below.
            continue
        line_data = line.split('\t')
        # Touch column six first so short records still raise IndexError.
        strand = line_data[5]
        coordinates_list.append(line_data[:5] + [strand])
    return coordinates_list


# FUNCTION to retrieve for each element in the input: reference sequence, flank upstream, flank downstream.
def parseReference(reference, originalLocation, paddedLocation, padding):
    """Cut the TE and both flanks of every input element out of the reference.

    Args:
        reference: path of the reference FASTA file.
        originalLocation: records as returned by ``original_coordinates``.
        paddedLocation: records as returned by ``mod_coordinates``; entry *i*
            must describe the same element as ``originalLocation[i]``.
        padding: padding that was applied; only used as part of the identifier.

    Returns:
        A dictionary keyed by ``chrom:start-stop_name_strand_padding`` whose
        values are 3-tuples of slices of the chromosome record:
        (TE sequence, upstream flank, downstream flank).

    Raises:
        SystemExit: a chromosome of the input is not present in the reference.
    """
    print("*** Reading the reference into a dictionary: " + str(time.asctime( time.localtime(time.time()) )))
    # Close the FASTA handle as soon as all records are loaded; the original
    # left it open for the rest of the run.
    with open(reference) as reference_handle:
        ref_dict = SeqIO.to_dict(SeqIO.parse(reference_handle, "fasta"))
    print("*** Retrieving the reference, upstream and downstream flanking sequence for the input: " + str(time.asctime( time.localtime(time.time()) )))

    padding = str(padding)
    TE_dict = {}
    for count, original in enumerate(originalLocation):
        chrom = original[0]
        start = original[1]
        stop = original[2]
        name = original[3]
        # The strand is the last column and may still carry the line ending.
        strand = original[5].split('\n')[0]
        start_padded = paddedLocation[count][1]
        stop_padded = paddedLocation[count][2]
        TE_ID = chrom + ':' + start + '-' + stop + '_' + name + '_' + strand + '_' + padding

        if chrom not in ref_dict:
            sys.exit("ERROR ** "+chrom+" is not given in your reference file")

        sequence = ref_dict[chrom]
        # (reference sequence, left flank, right flank)
        TE_dict[TE_ID] = (
            sequence[int(start):int(stop)],
            sequence[int(start_padded):int(start)],
            sequence[int(stop):int(stop_padded)],
        )

    return TE_dict


# FUNCTION to run build_last_db() on the sequencing data.
    # Runs only if database does not exist yet (as specified in main).
        # Uses multiple threads if possible.
            # Using -e 4G, but this depends on the server...
def build_last_db(fa, threads):
    """Build the LAST database for the FASTA file *fa* (database name = *fa*).

    ``lastdb -P <threads> -s 4G`` is tried first. When that command exits with
    an error, ``lastdb`` is run again without ``-P``, on the assumption that
    this installation was compiled without multi-threading. The exit status of
    the second run is not checked.

    The command is passed as an argument list rather than a shell string, so
    paths containing spaces or shell metacharacters are handled correctly.
    """
    print("*** Building last database ***")
    print('Take a coffee, this might take a while...' + '\n')
    try:
        subprocess.check_output(
            ['lastdb', '-P', str(threads), '-s', '4G', fa, fa],
            stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError:
        print("lastdb: I was installed here with multi-threading disabled." + '\n')
        print("Running lastdb without multi-threading." + '\n')
        subprocess.call(['lastdb', '-s', '4G', fa, fa])


# FUNCTION to write data from the alignment step into excel sheets.
def saveAlignmentData(one_notAligned, dfMergedNaN, resultsfile):
    """Write the not-aligned tables to ``<resultsfile>_not_aligned.xlsx``.

    Args:
        one_notAligned: DataFrame written to the sheet 'One-NotAligned'.
        dfMergedNaN: DataFrame written to the sheet 'Merged-NotAligned'.
        resultsfile: output prefix; '_not_aligned.xlsx' is appended.

    Both tables start at spreadsheet row 6 (``startrow=5``) so that a bold
    title fits in cell A1 of each sheet.
    """
    excelOutput = resultsfile + '_not_aligned.xlsx'
    writer = pd.ExcelWriter(excelOutput, engine='xlsxwriter')
    bold = writer.book.add_format({'bold': True})

    one_notAligned.to_excel(writer, sheet_name = 'One-NotAligned', startrow = 5)
    worksheet = writer.sheets['One-NotAligned']
    # write_rich_string() needs more than one format/string pair and ignores
    # the call otherwise, so the title was never written; a plain formatted
    # string is what is wanted here.
    worksheet.write_string('A1', 'TEs for which one of the flanking regions did not align.', bold)

    dfMergedNaN.to_excel(writer, sheet_name = 'Merged-NotAligned', startrow = 5)
    worksheet = writer.sheets['Merged-NotAligned']
    worksheet.set_column('A:I', 10)
    # Widen the two columns that hold the long identifiers.
    worksheet.set_column('B:B', 45)
    worksheet.set_column('F:F', 45)
    worksheet.write_string('A1', 'TEs for which flank(s) did not align.', bold)

    # close() saves the workbook; unlike save() it exists in old and new pandas.
    writer.close()

   
# FUNCTION to grep sequence of interest out of 'sequencing' file.
def alignLocation(TE_dict_ref, fa, e, resultsfile, wgs_dict, lastal_version):
    """Locate every TE of TE_dict_ref in the sequencing data via its two flanks.

    For each key of TE_dict_ref the upstream flank (element 1 of the value) and
    then the downstream flank (element 2) are written to
    'temp-flank-file.fasta' and aligned against the LAST database `fa` with
    `lastal`. Per hit ID only the hit(s) with the highest score and, among
    those, the largest alignment size are kept. Up- and downstream hits are
    merged on the hit ID, a 'P/A poly' code is computed, the sequence between
    the two flanks is cut out of wgs_dict and sequences containing 1% or more
    'N' are dropped.

    Parameters:
        TE_dict_ref: dict TE_ID -> (TE record, upstream flank, downstream flank);
            the `.seq` attribute of each element is used.
        fa: path of the sequencing FASTA / LAST database name.
        e: percentage of the flank length passed to `lastal -e`.
        resultsfile: prefix of the '<prefix>_not_aligned.xlsx' output.
        wgs_dict: dict hit ID -> record with a `.seq` attribute.
        lastal_version: versions below 750 are called with '-f 0', others with
            '-f tab'.

    Returns:
        A DataFrame with one row per retained sequence (columns ID, sequence,
        start, score-up, alnSize-up, stop, score-down, alnSize-down, P/A poly,
        TE ID, Ref_Seq), or [] when neither '<fa>.tis' nor '<fa>.prj' exists.

    Side effects: overwrites 'temp-flank-file.fasta' in the working directory
    (it is not removed here) and writes the Excel file described above.
    """
    # Look for upstream flank. If match: note the location where the end of the flank matches.
    # Look for downstream flank. If match: note the location where the beginning of the flank matches.
    # For the TE sequence: take sequence from end of upstream flank to beginning of downstream flank.

    # Make temporary file of flank sequence and run align_last with only this file.
    print "*** Searching for input elements in the sequencing data"
    # sequenceDict and both_notAligned are never used in this function.
    sequenceDict = {}
    total_sequence = pd.DataFrame()
    one_notAligned = pd.DataFrame()
    both_notAligned = pd.DataFrame()
    headerUp = ['ID', 'start', 'score-up', 'alnSize-up']
    headerDown = ['ID', 'stop', 'score-down', 'alnSize-down']

    # The index counts as missing only when both files are absent.
    if not (os.path.exists(fa + '.tis')) and not (os.path.exists(fa + '.prj')):
        sys.stderr.write('Warning: no lastdb index for %s\n' % fa)
        return []

    # For each TE in the TE_dict_ref:
    for key in TE_dict_ref:
        ftemp_name = 'temp-flank-file.fasta'
        with open(ftemp_name, "w") as ftemp:
            # Look for the upstream flank in the sequencing data.
            #print flankUp
            flankUp = TE_dict_ref[key][1].seq
            ftemp.write(">"+key+"\n"+str(flankUp)+"\n")

        dictUP = {}
        # filter lastal output to obtain best match. Using e to determine percentage of bases that must match.
        l = len(flankUp) * (float(e)/100)
        l = int(l)
        if lastal_version < 750:
            last_cmd = "lastal -e "+str(l) +" -f 0 "+fa+" "+ftemp_name
        else:
            last_cmd = "lastal -e "+str(l) +" -f tab "+fa+" "+ftemp_name
        output = subprocess.check_output(last_cmd, shell=True).split('\n')  
        # Don't print lines that start with "#" and the last line of the output.
        # Take the first coordinate of the alignment plus the length of the alignment and store it. This is where the sequence of the TE starts in the query.
        output = output[:-1]
        count = 0

        dfUpFiltered = pd.DataFrame(columns = headerUp)
        dfDownFiltered = pd.DataFrame(columns = headerDown)
        # Placeholder row (start -1, NaN score/size) stored under index 0. The
        # first parsed hit is stored under index 0 as well and replaces it, so
        # the placeholder only survives when lastal reports no hit.
        dictUP[count] = key, -1 , np.nan, np.nan
        dfUP = pd.DataFrame(columns = headerUp)

        for line in output:
            # print line.split('\t')
            if not '#' in line:
                # Columns used: 0 = score, 1 = hit ID, 2 = hit start, 3 = alignment size.
                line_data = line.split('\t')
                ID_line = line_data[1]
                beginTE = int(line_data[2]) + int(line_data[3])
                score = int(line_data[0])
                alnSize = int(line_data[3]) 
                dictUP[count] = ID_line, beginTE, score, alnSize
                count += 1   
        # Always true here because of the placeholder entry above.
        if len(dictUP) > 0:
            dfUP = pd.DataFrame.from_dict(dictUP).transpose()
            dfUP.columns = headerUp

        dfUPnan = pd.DataFrame(columns = headerUp)
        dfUPnotNaN = pd.DataFrame(columns = headerUp)
        dfUPnan = dfUP[dfUP['alnSize-up'].isnull()]
        dfUPnotNaN = dfUP[dfUP['alnSize-up'].notnull()]
        dfUpFiltered = pd.DataFrame(columns = headerUp)

        if len(dfUPnotNaN) > 0:
            dfUPnotNaN.columns = headerUp
            # Per hit ID keep the best-scoring hits, then of those the longest alignments.
            dfUpFiltered = dfUPnotNaN[dfUPnotNaN['score-up'] == dfUPnotNaN.groupby(['ID'])['score-up'].transform(max)]
            dfUpFiltered2 = dfUpFiltered[dfUpFiltered['alnSize-up'] == dfUpFiltered.groupby(['ID'])['alnSize-up'].transform(max)]                
            dfUpFiltered = dfUpFiltered2
            dfUpFiltered = dfUpFiltered.set_index(['ID'])

        # Look for the downstream flank in the sequencing data.
        ftemp_name = 'temp-flank-file.fasta'

        with open(ftemp_name, "w") as ftemp:
            flankDown = TE_dict_ref[key][2].seq
            ftemp.write(">"+key+"\n"+str(flankDown)+"\n")
        l = len(flankDown) * (float(e)/100)
        l = int(l)
        # NOTE(review): last_cmd is not rebuilt here, so the l computed just
        # above is unused and the -e value derived from the upstream flank is
        # applied to the downstream flank too -- confirm whether intended.
        output = subprocess.check_output(last_cmd, shell=True).split('\n')

        # For each instance where an alignment is found:
        # Take the first coordinate of the alignment and store it together with the ID. This is where the TE sequence stops in the query.
        dictDOWN = {}
        output = output[:-1]
        count = 0
        dfDOWN = pd.DataFrame(columns = headerDown)

        for line in output:
            if not '#' in line:
                line_data = line.split('\t')
                ID_line = line_data[1]
                endTE = int(line_data[2])
                score = int(line_data[0])
                alnSize = int(line_data[3])
                dictDOWN[count] = ID_line, endTE, score, alnSize
                count += 1
        # Unlike the upstream side there is no placeholder row, so dfDOWN stays
        # empty when lastal reports no hit.
        if len(dictDOWN) > 0:
            dfDOWN = pd.DataFrame.from_dict(dictDOWN).transpose() 
            dfDOWN.columns = headerDown

        dfDOWNnan = pd.DataFrame(columns = headerDown)
        dfDOWNnotNaN = pd.DataFrame(columns = headerDown)         
        dfDOWNnan = dfDOWN[dfDOWN['alnSize-down'].isnull()]
        dfDOWNnotNaN = dfDOWN[dfDOWN['alnSize-down'].notnull()]
        dfDownFiltered = pd.DataFrame(columns = headerDown)

        if len(dfDOWNnotNaN) > 0:
            dfDownFiltered = dfDOWNnotNaN[dfDOWNnotNaN['score-down'] == dfDOWNnotNaN.groupby(['ID'])['score-down'].transform(max)]
            dfDownFiltered2 = dfDownFiltered[dfDownFiltered['alnSize-down'] == dfDownFiltered.groupby(['ID'])['alnSize-down'].transform(max)]
            dfDownFiltered = dfDownFiltered2
            dfDownFiltered = dfDownFiltered.set_index(['ID'])

        # Merge dataframes and filter out rows that have NaN (only start or stop is present).
        combinedHeader = headerUp + headerDown[1:]
        dfMerged = pd.DataFrame(columns = combinedHeader)
        # `concat` is never read afterwards.
        concat = False

        # When only one side aligned, give the other side a placeholder row
        # (-1 / NaN) per hit ID so the merge below still produces a row.
        if len(dfUpFiltered) == 0 and len(dfDownFiltered) > 0:
            for index, row in dfDownFiltered.iterrows():
                dfTemp = pd.DataFrame([[index, -1, np.nan, np.nan]], columns = headerUp)
                dfTemp = dfTemp.set_index('ID')
                dfUpFiltered = pd.concat([dfTemp, dfUpFiltered]) 

        if len(dfDownFiltered) == 0 and len(dfUpFiltered) > 0:
            for index, row in dfUpFiltered.iterrows():
                tmp = [{'ID':index, 'stop':-1, 'score-down':np.nan, 'alnSize-down':np.nan}]
                dfTemp = pd.DataFrame(tmp)#[[index, -1, np.nan, np.nan]], columns = headerDown)
                dfTemp = dfTemp.set_index('ID')
                if len(dfDownFiltered) == 0:
                    dfDownFiltered = dfTemp
                else:
                    dfDownFiltered = pd.concat([dfTemp, dfDownFiltered])


        dfMerged = dfUpFiltered.merge(dfDownFiltered, left_index = True, right_index = True)
        # NOTE(review): dfMergedNaN is reassigned on every iteration, so only
        # the rows of the last TE reach the Excel sheet written after the loop,
        # and the name is undefined there when TE_dict_ref is empty.
        dfMergedNaN = pd.concat([dfMerged[dfMerged['alnSize-up'].isnull()], dfMerged[dfMerged['alnSize-down'].isnull()], dfDOWN[dfDOWN['alnSize-down'].isnull()],dfUP[dfUP['alnSize-up'].isnull()]]) 

        # Determine if there is a present/absent polymorphism.
        # 'P/A poly' code: +1 when start == stop, +2 when start plus both
        # alignment sizes (optionally + 1) equals stop; 0 otherwise.
        if len(dfMerged) > 0:
            dfMerged['P/A poly'] = 0
            for index, row in dfMerged.iterrows():
                presentAbsent = 0
                if row['start'] < 0 or row['stop'] < 0:
                    # One flank is a placeholder (-1): record the row as
                    # "one flank not aligned" and mark it for removal below.
                    row['Query_ID'] = index
                    row['TE_ID'] = key
                    # NOTE(review): concatenating a Series and transposing the
                    # result on every hit looks unintended -- verify the shape
                    # of one_notAligned.
                    one_notAligned = pd.concat([one_notAligned, row]).transpose()
                    dfMerged.loc[index, 'start'] = np.nan
                else:
                    # Swap so that start <= stop; this only changes the local
                    # row used for the checks below.
                    if int(row['start']) > int(row['stop']):
                        temp = int(row['stop'])
                        row['stop'] = int(row['start'])
                        row['start'] = temp                   
                    if int(row['start']) == int(row['stop']):
                        presentAbsent += 1
                    if (int(row['start']) + int(row['alnSize-up']) + int(row['alnSize-down']) + 1 == int(row['stop'])) or (int(row['start']) + int(row['alnSize-up']) + int(row['alnSize-down']) == int(row['stop'])):
                        presentAbsent += 2
                dfMerged.loc[index, 'P/A poly'] = presentAbsent

            dfMerged = dfMerged[dfMerged.start.notna()]
            dfMerged = dfMerged[dfMerged.stop.notna()]

            # Make a file containing the the WGS sequence ID + the WGS sequence.
            partSeq = {}        
            #wgs_dict = SeqIO.to_dict(SeqIO.parse(open(fa), "fasta"))
            number = 0
            for index, row in dfMerged.iterrows():
                if int(row['start']) > int(row['stop']):
                    temp = int(row['stop'])
                    row['stop'] = int(row['start'])
                    row['start'] = temp
                start = int(row['start'])
                stop = int(row['stop'])
                ID = index
                # Hits whose ID is not a key of wgs_dict are skipped silently.
                if index in wgs_dict:
                    # Need to make a string, otherwise it will still give all components of the BioSeq'object'
                    wholeSeq = str(wgs_dict[index].seq)
                    partSeq[number] = ID, wholeSeq[int(start):int(stop)], row['start'], row['score-up'], row['alnSize-up'], row['stop'], row['score-down'], row['alnSize-down'], row['P/A poly']
                    number += 1

            if len(partSeq) > 0:

                partSeqDF = pd.DataFrame.from_dict(partSeq).transpose()
                partSeqDF.columns = ['ID', 'sequence', 'start', 'score-up', 'alnSize-up', 'stop', 'score-down', 'alnSize-down', 'P/A poly']
                # FILTER sequences on the presence of 'N's.
                # Only upper-case 'N' is counted; empty sequences are kept.
                for index,row in partSeqDF.iterrows():
                    sequence = row['sequence']
                    length = len(sequence)
                    nCount = 0.0
                    if length != 0:
                        for ch in sequence:
                            if ch == 'N':
                                nCount += 1
                        percentage = nCount / length
                        if percentage >= 0.01:
                            # Change sequence to NaN
                            # NOTE(review): chained indexing (.loc[index][...])
                            # may assign to a temporary copy, in which case the
                            # sequence is not dropped -- verify.
                            partSeqDF.loc[index]['sequence'] = np.nan
                filtered_sequenceDF = partSeqDF.dropna(axis=0, subset = ['sequence'])
                filtered_sequenceDF['TE ID'] = key
                filtered_sequenceDF['Ref_Seq'] = str(TE_dict_ref[key][0].seq)
                total_sequence = total_sequence.append(filtered_sequenceDF, ignore_index=False)

    # Write the not-aligned tables; same layout as saveAlignmentData().
    excelOutput = resultsfile + '_not_aligned.xlsx'

    writer = pd.ExcelWriter(excelOutput, engine='xlsxwriter')

    workbook = writer.book
    # `italic` is created but never used.
    italic = workbook.add_format({'italic': True})
    bold = workbook.add_format({'bold': True})

    one_notAligned.to_excel(writer, sheet_name = 'One-NotAligned', startrow = 5)
    worksheet = writer.sheets['One-NotAligned']
    # NOTE(review): XlsxWriter appears to ignore write_rich_string() calls with
    # a single format/string pair, so this title may never be written -- confirm.
    worksheet.write_rich_string('A1', bold, 'TEs for which one of the flanking regions did not align.')

    dfMergedNaN.to_excel(writer, sheet_name = 'Merged-NotAligned', startrow = 5)
    worksheet = writer.sheets['Merged-NotAligned']
    worksheet.set_column('A:I', 10)
    worksheet.set_column('B:B', 45)
    worksheet.set_column('F:F', 45)

    worksheet.write_rich_string('A1', bold, 'TEs for which flank(s) did not align.')

    writer.save()
    #print total_sequence
    return total_sequence

    
# FUNCTION to grep sequence of interest out of 'sequencing' file.   
def _lastal_flank_hits(flank_seq, key, ftemp_name, e, fa, lastal_version):
    """Align one flank with lastal and return its hit lines, split on tabs.

    The flank is written to `ftemp_name` as a single-record FASTA (replacing
    any previous content) and aligned against `fa`. The lastal `-e` value is
    `e` percent of this flank's own length, so each flank gets its own
    threshold.
    """
    with open(ftemp_name, "w") as ftemp:
        ftemp.write(">" + key + "\n" + str(flank_seq) + "\n")

    min_score = int(len(flank_seq) * (float(e) / 100))
    # lastal older than 750 is given the numeric format code instead of the
    # name (presumably the same tabular layout -- confirm against LAST docs).
    out_format = "0" if lastal_version < 750 else "tab"
    last_cmd = "lastal -e " + str(min_score) + " -f " + out_format + " " + fa + " " + ftemp_name
    output = subprocess.check_output(last_cmd, shell=True).split('\n')
    # The element after the final newline is dropped; comment lines (anything
    # containing '#') and blank lines carry no alignment.
    return [line.split('\t') for line in output[:-1]
            if line.strip() and '#' not in line]


def _best_hits_per_id(hits, score_col, size_col):
    """Keep, per ID, the rows with the highest score and then the largest alignment size."""
    top_score = hits[hits[score_col] == hits.groupby(['ID'])[score_col].transform(max)]
    top_size = top_score[top_score[size_col] == top_score.groupby(['ID'])[size_col].transform(max)]
    return top_size.set_index(['ID'])


# FUNCTION to grep sequence of interest out of 'sequencing' file (one TE per call).
def alignLocationparallel(run, TE_dict_ref, e, fa, key, wgs_dict, headerUp, headerDown, output_name, lastal_version):
    """Locate the TE `key` in the sequencing data via its two flanks.

    Both flanks are aligned with lastal; per hit ID the best upstream and
    downstream hits are merged, and the sequence between them is cut out of
    `wgs_dict`.

    Parameters:
        run: -1 selects the fixed temp file name 'temp-flank-file.fasta';
            any other value is embedded in a per-run temp file name.
        TE_dict_ref: TE_dict_ref[key] is indexed with [0] (reference record,
            stored as 'Ref_Seq'), [1] (upstream flank) and [2] (downstream
            flank); each must expose `.seq`.
        e: percentage of the flank length used as the lastal `-e` value.
        fa: database argument handed to lastal.
        key: identifier of the TE being processed.
        wgs_dict: maps a hit ID to a record exposing `.seq`.
        headerUp, headerDown: column names for the hit tables; the code
            relies on 'ID', 'start', 'score-up', 'alnSize-up' and
            'ID', 'stop', 'score-down', 'alnSize-down' being among them.
        output_name: prefix of the per-run temp file name.
        lastal_version: integer lastal version; selects the `-f` argument.

    Returns:
        [[dfMergedNaN], [one_notAligned], [filtered_sequenceDF]]

    Side effects:
        Writes the temp FASTA file and appends its name to the module-level
        `to_delete` list.
    """
    global to_delete

    one_notAligned = pd.DataFrame()
    filtered_sequenceDF = pd.DataFrame()

    if run == -1:
        ftemp_name = 'temp-flank-file.fasta'
    else:
        ftemp_name = output_name + '_temp-flank-file-' + str(run) + '.fasta'

    # --- Upstream flank: the TE starts where the alignment ends. ---
    up_hits = _lastal_flank_hits(TE_dict_ref[key][1].seq, key, ftemp_name, e, fa, lastal_version)
    # Placeholder row for "no hit"; the first real hit overwrites it.
    dictUP = {0: (key, -1, np.nan, np.nan)}
    for count, line_data in enumerate(up_hits):
        dictUP[count] = (line_data[1],
                         int(line_data[2]) + int(line_data[3]),
                         int(line_data[0]),
                         int(line_data[3]))
    dfUP = pd.DataFrame.from_dict(dictUP).transpose()
    dfUP.columns = headerUp

    dfUPnotNaN = dfUP[dfUP['alnSize-up'].notnull()]
    if len(dfUPnotNaN) > 0:
        dfUpFiltered = _best_hits_per_id(dfUPnotNaN, 'score-up', 'alnSize-up')
    else:
        dfUpFiltered = pd.DataFrame(columns=headerUp)

    # --- Downstream flank: the TE stops where the alignment starts. ---
    # The lastal command is rebuilt here so the threshold follows the length
    # of the downstream flank instead of reusing the upstream command.
    down_hits = _lastal_flank_hits(TE_dict_ref[key][2].seq, key, ftemp_name, e, fa, lastal_version)
    to_delete.append(ftemp_name)

    dictDOWN = {}
    for count, line_data in enumerate(down_hits):
        dictDOWN[count] = (line_data[1],
                           int(line_data[2]),
                           int(line_data[0]),
                           int(line_data[3]))
    if len(dictDOWN) > 0:
        dfDOWN = pd.DataFrame.from_dict(dictDOWN).transpose()
        dfDOWN.columns = headerDown
    else:
        dfDOWN = pd.DataFrame(columns=headerDown)

    dfDOWNnotNaN = dfDOWN[dfDOWN['alnSize-down'].notnull()]
    if len(dfDOWNnotNaN) > 0:
        dfDownFiltered = _best_hits_per_id(dfDOWNnotNaN, 'score-down', 'alnSize-down')
    else:
        dfDownFiltered = pd.DataFrame(columns=headerDown)

    # When only one side aligned, give the other side a placeholder row so
    # the inner merge below still yields a row.
    # NOTE(review): as before, only the LAST ID of the aligned side receives
    # a placeholder -- confirm whether every ID should get one.
    if len(dfUpFiltered) == 0 and len(dfDownFiltered) > 0:
        last_id = dfDownFiltered.index[-1]
        dfUpFiltered = pd.DataFrame([[last_id, -1, np.nan, np.nan]], columns=headerUp).set_index('ID')

    if len(dfDownFiltered) == 0 and len(dfUpFiltered) > 0:
        last_id = dfUpFiltered.index[-1]
        tmp = [{'ID': last_id, 'stop': -1, 'score-down': np.nan, 'alnSize-down': np.nan}]
        dfDownFiltered = pd.DataFrame(tmp).set_index('ID')

    # Merge on the hit ID and collect everything lacking an alignment size.
    dfMerged = dfUpFiltered.merge(dfDownFiltered, left_index=True, right_index=True)
    dfMergedNaN = pd.concat([dfMerged[dfMerged['alnSize-up'].isnull()],
                             dfMerged[dfMerged['alnSize-down'].isnull()],
                             dfDOWN[dfDOWN['alnSize-down'].isnull()],
                             dfUP[dfUP['alnSize-up'].isnull()]])

    if len(dfMerged) > 0:
        # Determine if there is a present/absent polymorphism.
        dfMerged['P/A poly'] = 0
        for index, row in dfMerged.iterrows():
            presentAbsent = 0
            if row['start'] < 0 or row['stop'] < 0:
                # One flank is a placeholder: report it and mark the row for removal.
                row['Query_ID'] = index
                row['TE_ID'] = key
                # NOTE(review): accumulation by concat + transpose is kept as it
                # was; verify the layout when more than one row ends up here.
                one_notAligned = pd.concat([one_notAligned, row], ignore_index=True).transpose()
                dfMerged.loc[index, 'start'] = np.nan
            else:
                start = int(row['start'])
                stop = int(row['stop'])
                if start > stop:
                    start, stop = stop, start
                if start == stop:
                    presentAbsent += 1
                flank_span = start + int(row['alnSize-up']) + int(row['alnSize-down'])
                if flank_span + 1 == stop or flank_span == stop:
                    presentAbsent += 2
            dfMerged.loc[index, 'P/A poly'] = presentAbsent

        dfMerged = dfMerged[dfMerged.start.notna()]
        dfMerged = dfMerged[dfMerged.stop.notna()]

        # Cut the region between the two flanks out of each WGS sequence.
        partSeq = {}
        number = 0
        for index, row in dfMerged.iterrows():
            if int(row['start']) > int(row['stop']):
                row['start'], row['stop'] = int(row['stop']), int(row['start'])
            start = int(row['start'])
            stop = int(row['stop'])
            if index in wgs_dict:
                # str() is needed, otherwise the Seq object itself is stored.
                wholeSeq = str(wgs_dict[index].seq)
                partSeq[number] = (index, wholeSeq[start:stop], row['start'], row['score-up'],
                                   row['alnSize-up'], row['stop'], row['score-down'],
                                   row['alnSize-down'], row['P/A poly'])
                number += 1

        if len(partSeq) > 0:
            partSeqDF = pd.DataFrame.from_dict(partSeq).transpose()
            partSeqDF.columns = ['ID', 'sequence', 'start', 'score-up', 'alnSize-up', 'stop', 'score-down', 'alnSize-down', 'P/A poly']

            # FILTER sequences on the presence of 'N's (1 percent or more).
            for index, row in partSeqDF.iterrows():
                sequence = row['sequence']
                length = len(sequence)
                if length != 0 and float(sequence.count('N')) / length >= 0.01:
                    # A single .loc call writes into partSeqDF itself; the
                    # chained form .loc[index]['sequence'] may hit a copy.
                    partSeqDF.loc[index, 'sequence'] = np.nan

            # Drop the rows flagged above; copy so the new columns are added
            # to an independent frame rather than to a slice.
            filtered_sequenceDF = partSeqDF.dropna(axis=0, subset=['sequence']).copy()
            filtered_sequenceDF['TE ID'] = key
            filtered_sequenceDF['Ref_Seq'] = str(TE_dict_ref[key][0].seq)

    return [[dfMergedNaN], [one_notAligned], [filtered_sequenceDF]]
            

# FUNCTION to align reference sequence with query sequence.
def blastLocation(la_results):
    """Globally align every query sequence with its reference TE using needle.

    For each row of `la_results` (columns 'sequence', 'ID', 'P/A poly',
    'Ref_Seq' and 'TE ID' are read) the two sequences are written to
    'temp-query-file.fasta' and 'temp-ref-file.fasta' in the working
    directory. If both are non-empty they are aligned with EMBOSS `needle`
    into 'outputfile.needle', and the gap runs of both aligned sequences are
    collected.

    Returns a list with one entry per row:
        [TE ID, query ID, gaps in reference, gaps in query,
         P/A poly, reference length, query length]
    where each gap is a [start, end) pair of positions in the aligned string.
    """

    def gap_runs(aligned):
        # Collect [start, end) for every run of '-', including a run that
        # reaches the end of the sequence (previously such a run was lost).
        runs = []
        start = -1
        for pos, ch in enumerate(aligned):
            if ch == '-':
                if start == -1:
                    start = pos
            elif start != -1:
                runs.append([start, pos])
                start = -1
        if start != -1:
            runs.append([start, len(aligned)])
        return runs

    print("*** Comparing reference with database sequences: " + str(time.asctime(time.localtime(time.time()))))

    ftemp_query = 'temp-query-file.fasta'
    ftemp_ref = 'temp-ref-file.fasta'
    outputfile_needle = 'outputfile.needle'

    blast_results = []
    for index, row in la_results.iterrows():
        query = row['sequence']
        queryID = row['ID']
        presentAbsent = row['P/A poly']
        ref = row['Ref_Seq']
        refID = row['TE ID']

        with open(ftemp_query, "w") as ftempQ:
            ftempQ.write(">" + queryID + "\n" + str(query) + "\n")
        with open(ftemp_ref, "w") as ftempR:
            ftempR.write(">" + refID + "\n" + str(ref) + "\n")

        gaplist_query = []
        gaplist_ref = []

        if len(str(ref)) != 0 and len(str(query)) != 0:
            subprocess.call(['needle', '-asequence', ftemp_ref, '-bsequence', ftemp_query, '-gapopen', '10.0', '-gapextend', '0.5', '-outfile', outputfile_needle])

            ref_parts = []
            query_parts = []
            count = 0
            with open(outputfile_needle, 'r') as output:
                for line in output:
                    if line.startswith('#'):
                        continue
                    line_data = [x for x in line.split(' ') if x != '']
                    # Skip blank lines and the match line between the two
                    # sequences (its first token starts with '|' or '.').
                    if len(line_data) > 1 and line_data[0][0] not in ('|', '.'):
                        # The third token is taken as the aligned sequence;
                        # accepted lines alternate reference, query.
                        if count % 2 == 0:
                            ref_parts.append(str(line_data[2]))
                        else:
                            query_parts.append(str(line_data[2]))
                        count += 1

            gaplist_ref = gap_runs(''.join(ref_parts))
            gaplist_query = gap_runs(''.join(query_parts))

        blast_results.append([refID, queryID, gaplist_ref, gaplist_query, presentAbsent, len(ref), len(query)])

    return blast_results
    
# FUNCTION to align reference sequence with query sequence, one row per call (per-row variant of blastLocation).
def blastLocationParallel(index, row, ooutput_name, query_length_filter):
    """Align one query sequence with its reference TE using stretcher.

    Parameters:
        index: identifier of the row; embedded in the temp file names.
        row: record providing 'sequence', 'ID', 'P/A poly', 'Ref_Seq' and
            'TE ID'.
        ooutput_name: prefix for the temp and alignment file names.
        query_length_filter: queries of this length or longer return [].
            It may be a number or a numeric string.

    Returns:
        [TE ID, query ID, gaps in reference, gaps in query, P/A poly,
         reference length, query length], or [] when the query is filtered
        out. Each gap is a [start, end) pair of positions in the aligned
        string.

    Side effects:
        Writes the two FASTA files (and the stretcher output when both
        sequences are non-empty) and appends their names to the module-level
        `to_delete` list.
    """
    global to_delete

    def gap_runs(aligned):
        # Collect [start, end) for every run of '-', including a run that
        # reaches the end of the sequence (previously such a run was lost).
        runs = []
        start = -1
        for pos, ch in enumerate(aligned):
            if ch == '-':
                if start == -1:
                    start = pos
            elif start != -1:
                runs.append([start, pos])
                start = -1
        if start != -1:
            runs.append([start, len(aligned)])
        return runs

    query = row['sequence']
    queryID = row['ID']
    presentAbsent = row['P/A poly']
    ref = row['Ref_Seq']
    refID = row['TE ID']

    ftemp_query = ooutput_name + '_temp-query-file_' + str(index) + '.fasta'
    with open(ftemp_query, "w") as ftempQ:
        ftempQ.write(">" + queryID + "\n" + str(query) + "\n")

    ftemp_ref = ooutput_name + '_temp-ref-file_' + str(index) + '.fasta'
    with open(ftemp_ref, "w") as ftempR:
        ftempR.write(">" + refID + "\n" + str(ref) + "\n")

    gaplist_query = []
    gaplist_ref = []
    outputfile_needle = None  # stays None when no alignment is run

    if len(str(ref)) != 0 and len(str(query)) != 0:
        outputfile_needle = ooutput_name + '_outputfile_' + str(index) + '.needle'
        ## stretcher marks the link between two basepairs with ':' where needle uses '|'.
        subprocess.call(['stretcher', '-asequence', ftemp_ref, '-bsequence', ftemp_query, '-gapopen', '16', '-gapextend', '4', '-outfile', outputfile_needle])

        ref_parts = []
        query_parts = []
        count = 0
        with open(outputfile_needle, 'r') as output:
            for line in output:
                if line.startswith('#'):
                    continue
                line_data = [x for x in line.split(' ') if x != '']
                # Skip blank lines and the match line between the two
                # sequences (its first token starts with '|', '.' or ':').
                if len(line_data) > 1 and line_data[0][0] not in ('|', '.', ':'):
                    # With stretcher the sequence is the second token (the
                    # third with needle); accepted lines alternate ref, query.
                    # NOTE(review): split(' ') leaves the newline on the last
                    # token, so a sequence that is last on its line carries
                    # '\n' into the joined string -- confirm against real
                    # stretcher output.
                    if count % 2 == 0:
                        ref_parts.append(str(line_data[1]))
                    else:
                        query_parts.append(str(line_data[1]))
                    count += 1

        gaplist_ref = gap_runs(''.join(ref_parts))
        gaplist_query = gap_runs(''.join(query_parts))

    lengthRef = len(ref)
    lengthQuery = len(query)
    # The limit may arrive as a string from the command line. Python 2
    # orders any int before any str, which would silently disable the filter.
    if lengthQuery < float(query_length_filter):
        results_list = [refID, queryID, gaplist_ref, gaplist_query, presentAbsent, lengthRef, lengthQuery]
    else:
        results_list = []

    to_delete.append(ftemp_query)
    to_delete.append(ftemp_ref)
    if outputfile_needle is not None:
        to_delete.append(outputfile_needle)
    return results_list


### Check the location and size of each deletion:
# Location: take the mean of the deletion's start and stop and divide it by the longer of the reference and query lengths.
# Size: take stop - start of the deletion.
def _deletion_stats(gaplist, row):
    """Return (sizes, locations) for one gap list, or (nan, nan) when it is empty.

    Each size is [str(stop - start)]. Each location is the one-element list
    [midpoint of the gap / max(query length, ref length)], rounded to three
    decimals.
    """
    if len(gaplist) == 0:
        return np.nan, np.nan
    sizes = []
    locations = []
    for gap in gaplist:
        start = gap[0]
        stop = gap[1]
        longest = max(row['Query length'], row['Ref length'])
        location = np.around(np.mean((start, stop)) / longest, decimals=3)
        sizes.append([str(stop - start)])
        locations.append([location])
    return sizes, locations


def analysis(results, resultsfile, gap_filter=25):
    """Add deletion sizes/locations to `results` and write them to an Excel file.

    Parameters:
        results: DataFrame with the columns 'TE ID', 'Query ID', 'P/A poly',
            'Gaplist Ref', 'Gaplist Query', 'Ref length' and 'Query length'.
            It is modified in place (four columns are added) before the
            reordered copy is produced.
        resultsfile: output path without extension; '.xlsx' is appended.
        gap_filter: rows with more than this many gaps in either gap list
            are marked 'MA' (misaligned). Defaults to 25.

    Returns:
        The DataFrame with its columns in report order.
    """
    results['Query Del Size'] = 0
    results['Ref Del Size'] = 0
    results['Query Del Location'] = 'NaN'
    results['Ref Del Location'] = 'NaN'

    for index, row in results.iterrows():
        # A missing gap list (None) is treated as "no gaps".
        gaplist_ref = [] if row['Gaplist Ref'] is None else row['Gaplist Ref']
        gaplist_query = [] if row['Gaplist Query'] is None else row['Gaplist Query']

        deletionList, locationList = _deletion_stats(gaplist_ref, row)
        results.loc[index, 'Ref Del Location'] = locationList
        results.loc[index, 'Ref Del Size'] = deletionList

        deletionList, locationList = _deletion_stats(gaplist_query, row)
        results.loc[index, 'Query Del Location'] = locationList
        results.loc[index, 'Query Del Size'] = deletionList

    # reindex(columns=...) replaces reindex_axis(..., axis=1), which is
    # deprecated since pandas 0.21.
    results = results.reindex(columns=['TE ID', 'Query ID', 'P/A poly', 'Gaplist Ref', 'Ref Del Size', 'Ref Del Location', 'Ref length', 'Gaplist Query', 'Query Del Size', 'Query Del Location', 'Query length'])

    # Mark rows that have more than gap_filter deletions/insertions as misaligned.
    masked_columns = ['Gaplist Ref', 'Ref Del Size', 'Ref Del Location',
                      'Gaplist Query', 'Query Del Size', 'Query Del Location']
    for index, row in results.iterrows():
        if (row['Gaplist Ref'] is not None) and (row['Gaplist Query'] is not None):
            if (len(row['Gaplist Ref']) > gap_filter) or (len(row['Gaplist Query']) > gap_filter):
                for column in masked_columns:
                    results.loc[index, column] = 'MA'

    excelOutput = resultsfile + '.xlsx'
    print("Output is stored at: " + excelOutput)

    writer = pd.ExcelWriter(excelOutput, engine='xlsxwriter')
    # The table starts at row 7 so the legend written below fits above it.
    results.to_excel(writer, sheet_name='Results-TEpoly-analysis', startrow=6)

    workbook = writer.book
    worksheet = writer.sheets['Results-TEpoly-analysis']

    italic = workbook.add_format({'italic': True})
    bold = workbook.add_format({'bold': True})

    # Later calls override earlier ones for the columns they share.
    worksheet.set_column('A:A', 5)
    worksheet.set_column('B:B', 45)
    worksheet.set_column('C:C', 16)
    worksheet.set_column('D:H', 10)
    worksheet.set_column('G:G', 15)
    worksheet.set_column('H:L', 13)
    worksheet.set_column('J:J', 17)

    worksheet.write_rich_string('A1', bold, 'Results of the TE polymorphism analysis')
    worksheet.write_rich_string('A2', italic, 'Interpret with caution')
    worksheet.write_rich_string('A4', italic, 'P/A poly: ', bold, '0', ' = no present/absent polymorphism, ', bold, '1', ' = full deletion, ', bold, '2', ' = non-ref TE.')
    worksheet.write_rich_string('A5', italic, 'MA = misaligned sequence')

    writer.save()

    return results



if __name__ == '__main__':

    # Command-line entry point: parse the options and hand them to main().
    # Numeric options declare type=int so a value given on the command line
    # has the same type as the integer default.
    parser = argparse.ArgumentParser(description='TE polymorphism analysis')
    parser.add_argument('-i', required=True, help='input, should be a .bed file containing coordinates of elements in the format: chrom, start, stop, name, score, strand')
    parser.add_argument('-r', required=True, help='input reference file')
    parser.add_argument('-s', required=True, help='sequencing fasta file')
    parser.add_argument('-t', required=False, type=int, default = 0, help='threads to use for lastdb. Default = max available')
    parser.add_argument('-o', required=False, default = 'results-TEpoly', help='output file')
    parser.add_argument('-p', required=False, type=int, default=500, help='padding in bp, which is going to be added to the coordinates as flanking regions. Default = 500')
    parser.add_argument('-g', required=True, help='chrom.sizes file, containing the sizes of the chromosomes')
    # -e is left untyped: the value is converted with float() where it is used.
    parser.add_argument('-e', required=False, default=30, help='lastal: minimum score for gapped alignments in percentage (1-100). Default = 30 (30 percent)')
    parser.add_argument('--threshold', required=False, type=int, default=0, help='threshold for filtering alignment of flanks. If more than --threshold number of alignments are found for the flank, filter on score. Default = 0')
    # NOTE(review): -f has no type/action, so any non-empty value (including
    # "False") arrives as a truthy string -- confirm how main() interprets it.
    parser.add_argument('-f', required=False, default=False, help='Set to true if filtering of flank alignments on score is required. Default = False')
    parser.add_argument('--cores', required=False, type=int, default=1, help='Set the number of cores used for multithreading, default = 1')
    parser.add_argument('--qc', required=False, type=int, default=10000, help='Set the maximum length of the query which will be returned by the blasting step. default=10000')
    args = parser.parse_args()

    main(args)


